/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <dirent.h>

#ifdef UDM_GUESSER_STANDALONG
#include <unistd.h>
#endif

#include "udm_config.h"

#include "udm_common.h"
#include "udm_crc32.h"
#include "udm_guesser.h"

#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_log.h"

/*#define DEBUG_GUESSER*/

/*
 * Load every language map file found in a directory.
 *
 * Scans mapdir and passes each entry whose name contains ".lm" to
 * UdmLoadLangMapFile() as "mapdir/entry".  Entries whose full path would
 * not fit into the local path buffer are skipped instead of overflowing it.
 *
 * Env    - environment handed through to UdmLoadLangMapFile()
 * mapdir - directory to scan
 *
 * Returns Env->LangMaps.nmaps after the scan, or 0 when mapdir cannot be
 * opened.
 */
int UdmLoadLangMapList(UDM_ENV * Env, const char * mapdir){

	DIR * dir;
	struct dirent * item;
	char fullname[1024]="";

	dir=opendir(mapdir);
	if(!dir)return 0;

	while((item=readdir(dir))){
		int len;

		/* Substring match kept from the original: the name only has to contain ".lm" */
		if(!strstr(item->d_name,".lm"))continue;

		/* Bounded formatting: a long mapdir must not overrun fullname */
		len=snprintf(fullname,sizeof(fullname),"%s/%s",mapdir,item->d_name);
		if(len<0 || (size_t)len>=sizeof(fullname))continue;

		UdmLoadLangMapFile(Env, fullname);
	}
	closedir(dir);
	return Env->LangMaps.nmaps;
}



/* Structure to sort guesser results: pairs a language map with its score */
typedef struct {
	UDM_LANGMAP * map;	/* language map the score belongs to */
	float quality;		/* score of this map; the key compared by statcmp() */
} UDM_MAPSTAT;

static int statcmp(const void * i1, const void * i2){
	float fres;
	fres = ((const UDM_MAPSTAT*)(i2))->quality - ((const UDM_MAPSTAT*)(i1))->quality;
	if(fres<0)return +1;
	if(fres>0)return -1;
	return 0;
}



/***************************************************************/

/*
 * qsort() comparator for UDM_LANGITEM entries: the count of the second
 * item minus the count of the first, so items with larger counts sort
 * first.
 */
static int cmpm(const void * i1,const void * i2){
	return ((const UDM_LANGITEM *)i2)->count - ((const UDM_LANGITEM *)i1)->count;
}

/*
 * Write a language map to stdout in text form: a comment banner, the
 * Language and Charset lines, then one "string<TAB>count" line per entry.
 *
 * The entries are sorted in place with cmpm() first.  Output stops at the
 * first entry with a zero count or once 500 entries have been written.
 * Spaces inside the stored strings are overwritten with '_' as they are
 * printed, so the map itself is modified.
 */
static void UdmPrintLangMap(UDM_LANGMAP * map){
	size_t pos = 0;

	printf("#\n#\n#\n\n");
	printf("Language: %s\n",map->lang);
	printf("Charset:  %s\n",map->charset);
	printf("\n\n");

	qsort(map->memb,UDM_LM_HASHMASK+1,sizeof(UDM_LANGITEM),&cmpm);

	while(pos<=UDM_LM_HASHMASK && pos!=500 && map->memb[pos].count){
		char * ch = map->memb[pos].str;

		/* Replace blanks so each entry stays a single token */
		while(*ch){
			if(*ch==' ')*ch='_';
			ch++;
		}

		printf("%s\t%d\n",map->memb[pos].str,map->memb[pos].count);
		pos++;
	}
}

/* Print the program banner and the command-line help to stdout */
static void usage(void){
	printf("mguesser %s-%s\n\n", PACKAGE, VERSION);
	fputs("To guess use:\n\n", stdout);
	fputs("\tmguesser [-n maxhits]< FILENAME\n\n", stdout);
	fputs("To create new language map use:\n\n", stdout);
	fputs("\tmguesser -p -c charset -l language < FILENAME\n", stdout);
}

/*
 * mguesser entry point.
 *
 * Guess mode (default): loads the language maps from UDM_CONF_DIR/langmap,
 * builds a map from the text on stdin, scores it against every loaded map
 * with UdmCheckLangMap(), sorts the scores with statcmp() and prints up to
 * maxhits lines of "quality<TAB>language<TAB>charset".
 *
 * Print mode (-p): builds a map from the text on stdin and prints it with
 * UdmPrintLangMap(), labelled with the -l and -c values.
 *
 * Options:
 *   -n maxhits   maximum number of result lines (default 1000)
 *   -c charset   charset label, required with -p
 *   -l language  language label, required with -p
 *   -p           print the built map instead of guessing
 *   -v           verbose messages on stderr
 *   -?           show usage
 *
 * Returns 0 on success and 1 on bad options, when env.errcode is set after
 * loading the maps, when -c or -l is missing in print mode, or when memory
 * cannot be allocated.
 */
int main(int argc, char ** argv){
	int ch;
	int verbose=0;
	int print=0;
	long n=1000;
	int rc=0;
	char buf[1024]="";
	UDM_ENV env;
	UDM_LANGMAP mchar;
	char * charset=NULL;
	char * lang=NULL;

	while((ch=getopt(argc,argv,"pv?c:l:n:"))!=-1){
		switch(ch){
			case 'n':
			{
				/* strtol() lets us reject input that atoi() silently read as 0 */
				char * end;
				long hits=strtol(optarg,&end,10);

				if(end==optarg || *end!='\0' || hits<0){
					fprintf(stderr,"Invalid value for -n: '%s'\n",optarg);
					exit(1);
				}
				n=hits;
				break;
			}
			case 'c':
				charset=optarg;
				break;
			case 'l':
				lang=optarg;
				break;
			case 'p':
				print++;
				break;
			case 'v':
				verbose++;
				break;
			case '?':
			default:
				usage();
				exit(1);
		}
	}
	argc-=optind;
	argv+=optind;

	/* Init structures */
	memset(&env,0,sizeof(env));
	memset(&mchar,0,sizeof(mchar));

	if(!print){
		/* Load all available lang ngram maps */
		if(verbose){
			fprintf(stderr,"Loading language maps from '%s'\n", UDM_CONF_DIR UDMSLASHSTR "langmap");
		}
		UdmLoadLangMapList(&env, UDM_CONF_DIR UDMSLASHSTR "langmap");
		if(env.errcode){
			printf("Error: '%s'\n",env.errstr);
			return 1;
		}

		if(verbose){
			fprintf(stderr, "%d maps found\n", env.LangMaps.nmaps);
		}
	}
	
	
	/* Add each STDIN line statistics */
	while(fgets(buf,sizeof(buf),stdin)){
		UdmBuildLangMap(&mchar,buf,strlen(buf), 1);
	}

#ifdef DEBUG_GUESSER	
	{
	  /* The table holds UDM_LM_HASHMASK+1 buckets; count the empty ones */
	  float count0 = 0; int i;
	  for (i = 0; i <= UDM_LM_HASHMASK; i++) {
	    if (mchar.memb[i].count == 0) count0 += 1.0;
	  }			
	  fprintf(stderr, "Count 0: %f, %.2f\n", count0, count0 * 100 / (UDM_LM_HASHMASK + 1));
	}
#endif

	if(print){
		/* Display built langmap */
		if(!charset){
			fprintf(stderr,"You must specify charset using -c\n");
			rc=1;
		}else
		if(!lang){
			fprintf(stderr,"You must specify language using -l\n");
			rc=1;
		}else{
			mchar.lang=strdup(lang);
			mchar.charset=strdup(charset);
			if(!mchar.lang || !mchar.charset){
				fprintf(stderr,"Can't allocate memory for language map labels\n");
				rc=1;
			}else{
				UdmPrintLangMap(&mchar);
			}
			/* The labels are owned here, release them once printed */
			free(mchar.lang);
			free(mchar.charset);
			mchar.lang=NULL;
			mchar.charset=NULL;
		}
	}else{
		int i;
		UDM_MAPSTAT * mapstat=NULL;
		
		/* Prepare map to comparison */
		UdmPrepareLangMap(&mchar);

		/* Allocate memory for comparison statistics. */
		/* Nothing is allocated, sorted or printed when no map was loaded. */
		if(env.LangMaps.nmaps>0){
			mapstat=malloc(env.LangMaps.nmaps * sizeof(*mapstat));
			if(!mapstat){
				fprintf(stderr,"Can't allocate memory for map statistics\n");
				UdmLangMapListFree(&env.LangMaps);
				return 1;
			}
		}

		if(mapstat){
			/* Calculate each lang map        */
			/* correlation with text          */
			/* and store in mapstat structure */
			for(i = 0; i < env.LangMaps.nmaps; i++){
				mapstat[i].quality = UdmCheckLangMap(&env.LangMaps.Map[i], &mchar);
				mapstat[i].map = &env.LangMaps.Map[i];
			}

			/* Sort statistics in quality order */
			qsort(mapstat, env.LangMaps.nmaps, sizeof(UDM_MAPSTAT), &statcmp);

			/* Display results. Best language is shown first. */
			/* Negative qualities are printed as 0. */
			for(i = 0; (i < env.LangMaps.nmaps) && (i < n); i++){
				printf("%.10f\t%s\t%s\n",mapstat[i].quality<0?0:mapstat[i].quality,mapstat[i].map->lang,mapstat[i].map->charset);
			}

			/* Free variables */
			free(mapstat);
		}
	}
	
	UdmLangMapListFree(&env.LangMaps);

	return rc;
}

